AMDGPU: Directly use v2bf16 as register type for bf16 vectors.#175761
Merged
AMDGPU: Directly use v2bf16 as register type for bf16 vectors.#175761
Conversation
Contributor
Author
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Member
|
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesPreviously we were casting v2bf16 to i32, unlike the f16 case. Simplify Patch is 184.57 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175761.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 139e2d101a077..61c75e1c107ed 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1111,11 +1111,8 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
if (Size == 16) {
- if (Subtarget->has16BitInsts()) {
- if (VT.isInteger())
- return MVT::v2i16;
- return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
- }
+ if (Subtarget->has16BitInsts())
+ return MVT::getVectorVT(ScalarVT.getSimpleVT(), 2);
return VT.isInteger() ? MVT::i32 : MVT::f32;
}
@@ -1168,7 +1165,7 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
// inconsistent.
if (Size == 16 && Subtarget->has16BitInsts()) {
if (ScalarVT == MVT::bf16) {
- RegisterVT = MVT::i32;
+ RegisterVT = MVT::v2bf16;;
IntermediateVT = MVT::v2bf16;
} else {
RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
index 3206f8e55f44e..0213bb35cc3a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslate-bf16.ll
@@ -8,24 +8,20 @@ define <3 x bfloat> @v3bf16(<3 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<3 x s16>) = G_TRUNC [[BUILD_VECTOR]](<3 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<4 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16)
- ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<3 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2)
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<3 x s16>)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[UV4]](s16), [[UV5]](s16), [[UV6]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[UV7:%[0-9]+]]:_(<2 x s16>), [[UV8:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV7]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV8]](<2 x s16>)
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%res = shufflevector <3 x bfloat> %arg0, <3 x bfloat> zeroinitializer, <3 x i32> <i32 3, i32 1, i32 2>
ret <3 x bfloat> %res
@@ -36,24 +32,15 @@ define <4 x bfloat> @v4bf16(<4 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<4 x s16>) = G_TRUNC [[BUILD_VECTOR]](<4 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
- ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<4 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT4]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT5]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<4 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<4 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
%res = shufflevector <4 x bfloat> %arg0, <4 x bfloat> zeroinitializer, <4 x i32> <i32 3, i32 1, i32 2, i32 0>
ret <4 x bfloat> %res
@@ -64,30 +51,22 @@ define <5 x bfloat> @v5bf16(<5 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<5 x s16>) = G_TRUNC [[BUILD_VECTOR]](<5 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<6 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<5 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<5 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<5 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4)
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<5 x s16>)
- ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[UV6]](s16), [[UV7]](s16), [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[UV11:%[0-9]+]]:_(<2 x s16>), [[UV12:%[0-9]+]]:_(<2 x s16>), [[UV13:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV11]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV12]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV13]](<2 x s16>)
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = shufflevector <5 x bfloat> %arg0, <5 x bfloat> zeroinitializer, <5 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4>
ret <5 x bfloat> %res
@@ -98,30 +77,17 @@ define <6 x bfloat> @v6bf16(<6 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<6 x s16>) = G_TRUNC [[BUILD_VECTOR]](<6 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
- ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<6 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5)
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16), [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
- ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT6]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT7]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT8]](s32)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<6 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<6 x s16>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<6 x s16>), [[BUILD_VECTOR]], shufflemask(3, 1, 2, 0, 4, 5)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>), [[UV2:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[SHUF]](<6 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV1]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV2]](<2 x s16>)
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
%res = shufflevector <6 x bfloat> %arg0, <6 x bfloat> zeroinitializer, <6 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5>
ret <6 x bfloat> %res
@@ -132,36 +98,24 @@ define <7 x bfloat> @v7bf16(<7 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
- ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<7 x s16>) = G_TRUNC [[BUILD_VECTOR]](<7 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr3
+ ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<2 x s16>), [[COPY1]](<2 x s16>), [[COPY2]](<2 x s16>), [[COPY3]](<2 x s16>)
+ ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16), [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16), [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<8 x s16>)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[UV]](s16), [[UV1]](s16), [[UV2]](s16), [[UV3]](s16), [[UV4]](s16), [[UV5]](s16), [[UV6]](s16)
; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT bfloat 0xR0000
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<7 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16)
- ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[TRUNC]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
+ ; GFX9-NEXT: [[SHUF:%[0-9]+]]:_(<7 x s16>) = G_SHUFFLE_VECTOR [[BUILD_VECTOR]](<7 x s16>), [[BUILD_VECTOR1]], shufflemask(3, 1, 2, 0, 4, 5, 6)
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s16), [[UV9:%[0-9]+]]:_(s16), [[UV10:%[0-9]+]]:_(s16), [[UV11:%[0-9]+]]:_(s16), [[UV12:%[0-9]+]]:_(s16), [[UV13:%[0-9]+]]:_(s16), [[UV14:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[SHUF]](<7 x s16>)
- ; GFX9-NEXT: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s16)
- ; GFX9-NEXT: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s16)
- ; GFX9-NEXT: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s16)
- ; GFX9-NEXT: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s16)
- ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT8]](s32)
- ; GFX9-NEXT: $vgpr1 = COPY [[ANYEXT9]](s32)
- ; GFX9-NEXT: $vgpr2 = COPY [[ANYEXT10]](s32)
- ; GFX9-NEXT: $vgpr3 = COPY [[ANYEXT11]](s32)
+ ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[UV8]](s16), [[UV9]](s16), [[UV10]](s16), [[UV11]](s16), [[UV12]](s16), [[UV13]](s16), [[UV14]](s16), [[DEF]](s16)
+ ; GFX9-NEXT: [[UV15:%[0-9]+]]:_(<2 x s16>), [[UV16:%[0-9]+]]:_(<2 x s16>), [[UV17:%[0-9]+]]:_(<2 x s16>), [[UV18:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[BUILD_VECTOR2]](<8 x s16>)
+ ; GFX9-NEXT: $vgpr0 = COPY [[UV15]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr1 = COPY [[UV16]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr2 = COPY [[UV17]](<2 x s16>)
+ ; GFX9-NEXT: $vgpr3 = COPY [[UV18]](<2 x s16>)
; GFX9-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3
%res = shufflevector <7 x bfloat> %arg0, <7 x bfloat> zeroinitializer, <7 x i32> <i32 3, i32 1, i32 2, i32 0, i32 4, i32 5, i32 6>
ret <7 x bfloat> %res
@@ -172,36 +126,19 @@ define <8 x bfloat> @v8bf16(<8 x bfloat> %arg0) {
; GFX9: bb.1 (%ir-block.0):
; GFX9-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
; GFX9-NEXT: {{ $}}
- ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
- ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
- ; GFX9-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](s32)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s16)
- ; GFX9-NEXT: [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY1]](s32)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s16)
- ; GFX9-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s16)
- ; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s16), [[UV5:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY2]](s32)
- ; GFX9-NEXT: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s16)
- ; GFX9-NEXT: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s16)
- ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s16), [[UV7:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY3]](s32)
- ; GFX9-NEXT: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s16)
- ; GFX9-NEXT: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32), [[ANYEXT3]](s32), [[ANYEXT4]](s32), [[ANYEXT5]](s32), [[ANYEXT6]](s32), [[ANYEXT7]](s32)
- ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(<8 x s16>) = G_TRUNC [[BUILD_VECTOR]](<8 x s32>)
+ ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9...
[truncated]
|
Previously we were casting v2bf16 to i32, unlike the f16 case. Simplify this by using the natural vector type. This is probably a leftover from before v2bf16 was treated as legal. This is preparation for fixing a miscompile in globalisel.
c727dab to
e6bb35b
Compare
This was referenced Jan 13, 2026
shiltian
approved these changes
Jan 13, 2026
Contributor
|
I think we also need to remove using |
Priyanshu3820
pushed a commit
to Priyanshu3820/llvm-project
that referenced
this pull request
Jan 18, 2026
…175761) Previously we were casting v2bf16 to i32, unlike the f16 case. Simplify this by using the natural vector type. This is probably a leftover from before v2bf16 was treated as legal. This is preparation for fixing a miscompile in globalisel.
BStott6
pushed a commit
to BStott6/llvm-project
that referenced
this pull request
Jan 22, 2026
…175761) Previously we were casting v2bf16 to i32, unlike the f16 case. Simplify this by using the natural vector type. This is probably a leftover from before v2bf16 was treated as legal. This is preparation for fixing a miscompile in globalisel.
This was referenced Jan 22, 2026
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.

Previously we were casting v2bf16 to i32, unlike the f16 case. Simplify
this by using the natural vector type. This is probably a leftover from
before v2bf16 was treated as legal. This is preparation for fixing a
miscompile in globalisel.